import numpy as np
import pandas as pd
import calendar
import datetime
import os
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we work with the Instacart Market Basket Analysis Dataset from Kaggle. The dataset can downloaded from Kaggle.com or from instacart.com.
The dataset for this competition is a relational set of files describing customers' orders over time. The goal of the competition is to predict which products will be in a user's next order. The dataset is anonymized and contains a sample of over 3 million grocery orders from more than 200,000 Instacart users. For each user, we provide between 4 and 100 of their orders, with the sequence of products purchased in each order. We also provide the week and hour of the day the order was placed and a relative measure of time between orders. For more information, see the blog post accompanying its public release.
The data dictionary is available here.
orders (3.4m rows, 206k users):
order_id: order identifieruser_id: customer identifiereval_set: which evaluation set this order belongs in (see SET described below)order_number: the order sequence number for this user (1 = first, n = nth)order_dow: the day of the week the order was placed onorder_hour_of_day: the hour of the day the order was placed ondays_since_prior: days since the last order, capped at 30 (with NAs for order_number = 1)products (50k rows):
product_id: product identifierproduct_name: name of the productaisle_id: foreign keydepartment_id: foreign keyaisles (134 rows):
aisle_id: aisle identifieraisle: the name of the aisledeptartments (21 rows):
department_id: department identifierdepartment: the name of the departmentorder_products__SET (30m+ rows):
order_id: foreign keyproduct_id: foreign keyadd_to_cart_order: order in which each product was added to cartreordered: 1 if this product has been ordered by this user in the past, 0 otherwisewhere SET is one of the four following evaluation sets (eval_set in orders):
"prior": orders prior to that users most recent order (~3.2m orders)"train": training data supplied to participants (~131k orders)"test": test data reserved for machine learning competitions (~75k orders)def Header(Text, L = 100, C1 = Back.BLUE, C2 = Fore.BLUE):
print(C1 + Fore.WHITE + Style.NORMAL + Text + Style.RESET_ALL + ' ' + C2 +
Style.NORMAL + (L- len(Text) - 1)*'=' + Style.RESET_ALL)
def Line(L=100, C = Fore.BLUE): print(C + Style.NORMAL + L*'=' + Style.RESET_ALL)
PATH = 'Instacart'
Files = os.listdir(PATH)
Files.remove('sample_submission.csv')
Files_Info = pd.DataFrame()
for i in range(len(Files)):
Header(Files[i])
Temp = pd.read_csv(os.path.join(PATH, Files[i]))
display(Temp.head(5))
Files_Info = Files_Info.append(pd.DataFrame({'File':[Files[i]], 'Number of Instances': [Temp.shape[0]],
'Number of Attributes': [Temp.shape[1]]}), ignore_index = True)
filename = Files[i].split('.')[0]
globals() [filename] = pd.read_csv(os.path.join(PATH, Files[i]))
Line()
display(Files_Info.style.hide_index())
Line()
del Files, i, Temp
aisles.csv =========================================================================================
| aisle_id | aisle | |
|---|---|---|
| 0 | 1 | prepared soups salads |
| 1 | 2 | specialty cheeses |
| 2 | 3 | energy granola bars |
| 3 | 4 | instant foods |
| 4 | 5 | marinades meat preparation |
departments.csv ====================================================================================
| department_id | department | |
|---|---|---|
| 0 | 1 | frozen |
| 1 | 2 | other |
| 2 | 3 | bakery |
| 3 | 4 | produce |
| 4 | 5 | alcohol |
orders.csv =========================================================================================
| order_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order | |
|---|---|---|---|---|---|---|---|
| 0 | 2539329 | 1 | prior | 1 | 2 | 8 | NaN |
| 1 | 2398795 | 1 | prior | 2 | 3 | 7 | 15.0 |
| 2 | 473747 | 1 | prior | 3 | 3 | 12 | 21.0 |
| 3 | 2254736 | 1 | prior | 4 | 4 | 7 | 29.0 |
| 4 | 431534 | 1 | prior | 5 | 4 | 15 | 28.0 |
order_products_prior.csv ===========================================================================
| order_id | product_id | add_to_cart_order | reordered | |
|---|---|---|---|---|
| 0 | 2 | 33120 | 1 | 1 |
| 1 | 2 | 28985 | 2 | 1 |
| 2 | 2 | 9327 | 3 | 0 |
| 3 | 2 | 45918 | 4 | 1 |
| 4 | 2 | 30035 | 5 | 0 |
order_products_train.csv ===========================================================================
| order_id | product_id | add_to_cart_order | reordered | |
|---|---|---|---|---|
| 0 | 1 | 49302 | 1 | 1 |
| 1 | 1 | 11109 | 2 | 1 |
| 2 | 1 | 10246 | 3 | 0 |
| 3 | 1 | 49683 | 4 | 0 |
| 4 | 1 | 43633 | 5 | 1 |
products.csv =======================================================================================
| product_id | product_name | aisle_id | department_id | |
|---|---|---|---|---|
| 0 | 1 | Chocolate Sandwich Cookies | 61 | 19 |
| 1 | 2 | All-Seasons Salt | 104 | 13 |
| 2 | 3 | Robust Golden Unsweetened Oolong Tea | 94 | 7 |
| 3 | 4 | Smart Ones Classic Favorites Mini Rigatoni Wit... | 38 | 1 |
| 4 | 5 | Green Chile Anytime Sauce | 5 | 13 |
====================================================================================================
| File | Number of Instances | Number of Attributes |
|---|---|---|
| aisles.csv | 134 | 2 |
| departments.csv | 21 | 2 |
| orders.csv | 3421083 | 7 |
| order_products_prior.csv | 32434489 | 4 |
| order_products_train.csv | 1384617 | 4 |
| products.csv | 49688 | 4 |
====================================================================================================
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(14, 6*3))
## The most ordered Products
Top_number = 15
Header('The Most Ordered Products', L = 100, C1 = Back.GREEN, C2 = Fore.GREEN)
Most_Ordered_Products = order_products_prior.groupby(['product_id'])['product_id'].agg({'count'})
Most_Ordered_Products.columns = ['Count']
Most_Ordered_Products = Most_Ordered_Products.reset_index(drop = False).sort_values(by='Count', ascending=False)
Most_Ordered_Products = pd.merge(Most_Ordered_Products, products, on='product_id', how='right')
Most_Ordered_Products = Most_Ordered_Products.reset_index(drop=True).dropna()
Most_Ordered_Products['Count'] = Most_Ordered_Products['Count'].astype(int)
display(Most_Ordered_Products.head(Top_number))
_ = sns.barplot(ax = ax[0], y='product_name', x='Count', palette='PuBu',
edgecolor='SkyBlue', hatch="///", data=Most_Ordered_Products.head(Top_number))
_ = sns.barplot(ax = ax[0], y='product_name', x='Count', facecolor = 'None',
edgecolor='k',data=Most_Ordered_Products.head(Top_number))
_ = ax[0].set_title('Top %s Ordered Products' % Top_number)
_ = ax[0].set_xlim([0, 5e5])
_ = ax[0].set_ylabel('Products')
_ = ax[0].set_xlabel('Count')
## The most Common Order Size
Top_number = 30
Header('The Most Frequent Order Size', L = 100, C1 = Back.GREEN, C2 = Fore.GREEN)
Most_Ordered = order_products_prior.groupby(['order_id'])['product_id'].agg({'count'})
Most_Ordered.columns = ['Number of Products in an Order']
Most_Ordered = Most_Ordered.reset_index(drop = False).\
groupby(['Number of Products in an Order'])['Number of Products in an Order'].agg({'count'})
Most_Ordered.columns = ['Count']
Most_Ordered = Most_Ordered.reset_index(drop = False).sort_values(by='Count', ascending=False)
display(Most_Ordered.head(Top_number).set_index('Number of Products in an Order').T)
_ = sns.barplot(ax = ax[1], x='Number of Products in an Order', y='Count', palette='OrRd',
edgecolor='DarkOrange', hatch="///", data= Most_Ordered.head(Top_number))
_ = sns.barplot(ax = ax[1], x='Number of Products in an Order', y='Count', facecolor = 'None',
edgecolor='k', data= Most_Ordered.head(Top_number))
_ = ax[1].set_title('%s Most Frequent Order Size' % Top_number)
_ = ax[1].set_ylim([0, 2.5e5])
## The Most Reordered Products
Top_number = 15
Header('The Most Reordered Products', L = 100, C1 = Back.GREEN, C2 = Fore.GREEN)
Temp = order_products_prior.loc[order_products_prior.reordered ==1]
Most_ReOrdered_Products = Temp.groupby(['product_id'])['product_id'].agg({'count'})
Most_ReOrdered_Products.columns = ['Count']
Most_ReOrdered_Products = Most_ReOrdered_Products.reset_index(drop = False).sort_values(by='Count', ascending=False)
Most_ReOrdered_Products = pd.merge(Most_ReOrdered_Products, products, on='product_id', how='right')
Most_ReOrdered_Products = Most_ReOrdered_Products.reset_index(drop=True).dropna()
Most_ReOrdered_Products['Count'] = Most_ReOrdered_Products['Count'].astype(int)
display(Most_ReOrdered_Products.head(Top_number))
_ = sns.barplot(ax = ax[2], y='product_name', x='Count', palette='PiYG',
edgecolor='SkyBlue', hatch="///", data=Most_ReOrdered_Products.head(Top_number))
_ = sns.barplot(ax = ax[2], y='product_name', x='Count', facecolor = 'None',
edgecolor='k',data=Most_ReOrdered_Products.head(Top_number))
_ = ax[2].set_title('Top %s Reordered Products' % Top_number)
_ = ax[2].set_xlim([0, 4.5e5])
_ = ax[2].set_ylabel('Products')
_ = ax[2].set_xlabel('Count')
The Most Ordered Products ==========================================================================
| product_id | Count | product_name | aisle_id | department_id | |
|---|---|---|---|---|---|
| 0 | 1 | 1852 | Chocolate Sandwich Cookies | 61 | 19 |
| 1 | 2 | 90 | All-Seasons Salt | 104 | 13 |
| 2 | 3 | 277 | Robust Golden Unsweetened Oolong Tea | 94 | 7 |
| 3 | 4 | 329 | Smart Ones Classic Favorites Mini Rigatoni Wit... | 38 | 1 |
| 4 | 5 | 15 | Green Chile Anytime Sauce | 5 | 13 |
| 5 | 6 | 8 | Dry Nose Oil | 11 | 11 |
| 6 | 7 | 30 | Pure Coconut Water With Orange | 98 | 7 |
| 7 | 8 | 165 | Cut Russet Potatoes Steam N' Mash | 116 | 1 |
| 8 | 9 | 156 | Light Strawberry Blueberry Yogurt | 120 | 16 |
| 9 | 10 | 2572 | Sparkling Orange Juice & Prickly Pear Beverage | 115 | 7 |
| 10 | 11 | 104 | Peach Mango Juice | 31 | 7 |
| 11 | 12 | 246 | Chocolate Fudge Layer Cake | 119 | 1 |
| 12 | 13 | 9 | Saline Nasal Mist | 11 | 11 |
| 13 | 14 | 17 | Fresh Scent Dishwasher Cleaner | 74 | 17 |
| 14 | 15 | 4 | Overnight Diapers Size 6 | 56 | 18 |
The Most Frequent Order Size =======================================================================
| Number of Products in an Order | 5 | 6 | 4 | 7 | 3 | 8 | 2 | 9 | 10 | 1 | ... | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Count | 228330 | 227675 | 222081 | 220006 | 207027 | 203374 | 186993 | 184347 | 165550 | 156748 | ... | 41863 | 36368 | 31672 | 27065 | 23613 | 20283 | 17488 | 15102 | 13033 | 11251 |
1 rows × 30 columns
The Most Reordered Products ========================================================================
| product_id | Count | product_name | aisle_id | department_id | |
|---|---|---|---|---|---|
| 0 | 1 | 1136 | Chocolate Sandwich Cookies | 61 | 19 |
| 1 | 2 | 12 | All-Seasons Salt | 104 | 13 |
| 2 | 3 | 203 | Robust Golden Unsweetened Oolong Tea | 94 | 7 |
| 3 | 4 | 147 | Smart Ones Classic Favorites Mini Rigatoni Wit... | 38 | 1 |
| 4 | 5 | 9 | Green Chile Anytime Sauce | 5 | 13 |
| 5 | 6 | 3 | Dry Nose Oil | 11 | 11 |
| 6 | 7 | 12 | Pure Coconut Water With Orange | 98 | 7 |
| 7 | 8 | 83 | Cut Russet Potatoes Steam N' Mash | 116 | 1 |
| 8 | 9 | 82 | Light Strawberry Blueberry Yogurt | 120 | 16 |
| 9 | 10 | 1304 | Sparkling Orange Juice & Prickly Pear Beverage | 115 | 7 |
| 10 | 11 | 59 | Peach Mango Juice | 31 | 7 |
| 11 | 12 | 126 | Chocolate Fudge Layer Cake | 119 | 1 |
| 12 | 13 | 3 | Saline Nasal Mist | 11 | 11 |
| 13 | 14 | 2 | Fresh Scent Dishwasher Cleaner | 74 | 17 |
| 14 | 15 | 1 | Overnight Diapers Size 6 | 56 | 18 |
Header('Orders Distributions (Week)', L = 100)
Orders_Distributions_Week = orders.groupby(['order_dow'])['order_dow'].agg({'count'})
Orders_Distributions_Week.index = calendar.day_name[0:7]
Orders_Distributions_Week.reset_index(inplace = True, drop = False)
Orders_Distributions_Week.columns =['Day of Week', 'Count']
display(Orders_Distributions_Week.set_index('Day of Week').T)
Header('Orders Distributions (Hours)', L = 100)
Orders_Distributions_Hours = orders.groupby(['order_hour_of_day'])['order_hour_of_day'].agg({'count'}).reset_index(drop = False)
Orders_Distributions_Hours.columns = ['Hour of Day', 'Count']
display(Orders_Distributions_Hours.set_index('Hour of Day').T)
fig = plt.figure(figsize=(13, 6), constrained_layout=True)
ax = np.zeros(2, dtype = 'object')
gs = fig.add_gridspec(1, 3)
ax[0] = fig.add_subplot(gs[0])
ax[1] = fig.add_subplot(gs[1:])
# Left Plot
_ = sns.barplot(ax = ax[0], y='Count', x='Day of Week', palette='summer',
edgecolor='LimeGreen', hatch="O", data= Orders_Distributions_Week)
_ = sns.barplot(ax = ax[0], y='Count', x='Day of Week', facecolor = 'None',
edgecolor='k',data= Orders_Distributions_Week)
_ = ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=90)
_ = ax[0].set_ylim([0, 7e5])
# Right Plot
_ = sns.barplot(ax = ax[1], y='Count', x='Hour of Day', palette='cool',
edgecolor='RoyalBlue', hatch="*", data= Orders_Distributions_Hours)
_ = sns.barplot(ax = ax[1], y='Count', x='Hour of Day', facecolor = 'None',
edgecolor='k',data= Orders_Distributions_Hours)
_ = ax[1].set_ylim([0, 3e5])
fig.tight_layout()
Orders Distributions (Week) ========================================================================
| Day of Week | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday |
|---|---|---|---|---|---|---|---|
| Count | 600905 | 587478 | 467260 | 436972 | 426339 | 453368 | 448761 |
Orders Distributions (Hours) =======================================================================
| Hour of Day | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Count | 22758 | 12398 | 7539 | 5474 | 5527 | 9569 | 30529 | 91868 | 178201 | 257812 | ... | 283042 | 283639 | 272553 | 228795 | 182912 | 140569 | 104292 | 78109 | 61468 | 40043 |
1 rows × 24 columns
Most orders take place on Mondays and Tuesdays. Besides, between 9:00 AM and 5:00 PM, most orders take place daily.
Orders_Distributions_Total = (orders.pivot_table(index='order_dow', columns='order_hour_of_day',
values='order_id', aggfunc='count').fillna(0)/1000).round(2)
Orders_Distributions_Total.index=calendar.day_name[0:7]
Orders_Distributions_Total.T.index.name = 'Hour of Day'
# display(Orders_Distributions_Total)
fig, ax = plt.subplots(figsize=(15,6))
_ = sns.heatmap(Orders_Distributions_Total, annot=True, cmap =sns.color_palette("RdYlGn", n_colors=10),
annot_kws={"size": 12}, linewidths=.5, ax=ax, vmin=0, vmax=60,
cbar_kws={ 'label': 'Number of Orders $\\times$ $10^3$', "aspect":30, "shrink": .5})
_ = ax.set_ylabel('Week Days')
_ = ax.set_xlabel('Hour of the day')
_ = ax.set_aspect(1)
_ = fig.tight_layout()
Between 9:00 AM and 5:00 PM, most orders take place on Mondays and Tuesdays.
Days_since_prior_order_summary = orders.days_since_prior_order.value_counts().sort_index().to_frame('Count').reset_index()
Days_since_prior_order_summary.columns = ['Days Since Prior Order', 'Count']
Days_since_prior_order_summary['Days Since Prior Order'] = Days_since_prior_order_summary['Days Since Prior Order'].astype(int)
#
fig = plt.figure(figsize=(13.5, 6))
ax = np.zeros(2, dtype = 'object')
gs = fig.add_gridspec(1, 3)
ax[0] = fig.add_subplot(gs[0])
ax[1] = fig.add_subplot(gs[1:])
# Left Plot
_ = ax[0].boxplot(orders.days_since_prior_order.dropna(), 0, 'rs', 0)
_ = ax[0].set_xlabel('Days since prior order')
_ = ax[0].set_title('Distribution of The Number\nof Days Since Prior Order')
_ = ax[0].set_xlim([0, 20])
# Right Plot
_ = sns.barplot(ax = ax[1], y='Count', x='Days Since Prior Order', palette='summer',
edgecolor='DarkGreen', hatch="..", data= Days_since_prior_order_summary)
_ = sns.barplot(ax = ax[1], y='Count', x='Days Since Prior Order', facecolor = 'None',
edgecolor='k', data= Days_since_prior_order_summary)
_ = ax[1].set_ylim([0, 4e5])
_ = ax[1].set_xlabel('Days Since Prior Order')
_ = ax[1].set_title('Distribution of The Number\nof Days Since Prior Order')
fig.tight_layout()
Usually, customers order from 5 days to 15 days after the prior order. The majority of orders take place after a week and a month after the prior order.
Products_Detailed = pd.merge(left =pd.merge(left=products, right= departments, how='left'), right=aisles, how='left')
Products_Detailed.columns = [x.replace('_',' ').title().replace('Id','ID') for x in Products_Detailed.columns.tolist()]
Products_Detailed['Department'] = Products_Detailed['Department'].map(lambda x: x.title())
Products_Detailed['Aisle'] = Products_Detailed['Aisle'].map(lambda x: x.title())
Products_Detailed.head(10).style.hide_index()
| Product ID | Product Name | Aisle ID | Department ID | Department | Aisle |
|---|---|---|---|---|---|
| 1 | Chocolate Sandwich Cookies | 61 | 19 | Snacks | Cookies Cakes |
| 2 | All-Seasons Salt | 104 | 13 | Pantry | Spices Seasonings |
| 3 | Robust Golden Unsweetened Oolong Tea | 94 | 7 | Beverages | Tea |
| 4 | Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce | 38 | 1 | Frozen | Frozen Meals |
| 5 | Green Chile Anytime Sauce | 5 | 13 | Pantry | Marinades Meat Preparation |
| 6 | Dry Nose Oil | 11 | 11 | Personal Care | Cold Flu Allergy |
| 7 | Pure Coconut Water With Orange | 98 | 7 | Beverages | Juice Nectars |
| 8 | Cut Russet Potatoes Steam N' Mash | 116 | 1 | Frozen | Frozen Produce |
| 9 | Light Strawberry Blueberry Yogurt | 120 | 16 | Dairy Eggs | Yogurt |
| 10 | Sparkling Orange Juice & Prickly Pear Beverage | 115 | 7 | Beverages | Water Seltzer Sparkling Water |
Products_by_Department = Products_Detailed.groupby('Department')['Product ID'].count().\
to_frame('Total Products').reset_index(drop = False)
Products_by_Department['Percentage'] = np.round(100* Products_by_Department['Total Products']\
/ Products_by_Department['Total Products'].sum(),2)
Products_by_Department.sort_values(by='Total Products', ascending=False, inplace=True)
Colors = ['LightGreen']
LC = 'DarkGreen'
fig = px.bar(Products_by_Department, x = 'Department', y='Total Products', text = 'Percentage',
color_discrete_sequence= Colors,
hover_data= Products_by_Department.columns)
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[0, 7e3])
fig.update_layout(title={'text': 'Total Products in Each Departments', 'x':0.5, 'y':0.94,
'xanchor': 'center', 'yanchor': 'top'})
fig.show()
It can be seen that Personal Care and Snacks are the best seller departments.
def Search_List(Key, List): return [s for s in List if Key in s]
myColors = list(mcolors.CSS4_COLORS.keys())
Temp = []
Temp.extend(Search_List('gray', myColors))
Temp.extend(Search_List('grey', myColors))
Temp.extend(Search_List('black', myColors))
myColors = list(set(myColors)-set(Temp))
#
Products_by_Aisle = Products_Detailed.groupby(["Department", "Aisle"])["Product ID"].count().\
to_frame('Total Products').reset_index(drop = False)
Products_by_Aisle['Percentage'] = np.round(100* Products_by_Aisle['Total Products']\
/ Products_by_Aisle['Total Products'].sum(),2)
Products_by_Aisle = Products_by_Aisle.sort_values(by='Total Products', ascending=False)
Products_by_Aisle = Products_by_Aisle.loc[(Products_by_Aisle['Department'] != 'Missing') &
(Products_by_Aisle['Aisle'] != 'Missing') &
(Products_by_Aisle['Aisle'] != 'Other')]
for d in Products_by_Aisle['Department'].unique().tolist():
Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Percentage'] = \
np.round(100* Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Total Products']\
/Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Total Products'].sum(),2)
# Plot
font = FontProperties()
font.set_weight('bold')
Temp = list(np.linspace(0, 100, 6))
m = Products_by_Aisle['Percentage'].max()
for j in range(len(Temp)-1):
if Temp[j] <= m < Temp[j+1]:
m = Temp[j+1]
break
N = len(Products_by_Aisle.Department.unique())
fpr = 4; C = 0; Ind = list(np.arange(0, N, fpr))
fig, ax = plt.subplots(nrows=int(np.ceil(N/fpr)), ncols=fpr, figsize=(16,30), sharey = True,
gridspec_kw = dict(hspace=1.4, wspace=.0))
ax = ax.ravel()
_ = fig.delaxes(ax[-1])
Palette = ['Purples', 'Blues', 'Greens', 'Oranges']*5
EdgeColor = ['Indigo', 'Navy', 'ForestGreen','Salmon']*5
Hatch = ['//', 'O', '--', '\\']*5
for (aisle, PBA), ax in zip(Products_by_Aisle.groupby(['Department']), ax.flatten()):
_ = sns.barplot(PBA['Aisle'], PBA['Percentage'] , ax=ax, palette= Palette[C], edgecolor=EdgeColor[C], hatch= Hatch[C])
_ = sns.barplot(PBA['Aisle'], PBA['Percentage'] , ax=ax, facecolor = 'None', edgecolor='k')
if C in Ind:
_ = ax.set(xlabel = 'Aisles', ylabel='Percentage', ylim = [0, 100])
else:
_ = ax.set(xlabel = 'Aisles', ylabel=None, ylim = [0, m])
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
_ = ax.set_title(aisle)
C +=1
_ = fig.suptitle(t = 'Product Distributions', y = .91,
fontproperties=font, fontsize = 16)
Colors = myColors.copy()
LC = 'Black'
fig = px.bar(Products_by_Aisle, x = 'Aisle', y='Total Products', color = 'Department',
text = 'Total Products', color_discrete_sequence= Colors, hover_data= ['Department', 'Total Products'])
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[0, 1.4e3])
fig.update_layout(title={'text': 'Total Products in Each Aisles', 'x':0.5, 'y':0.94,
'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Orders_Detailed = orders[['user_id', 'order_id']].merge(order_products_train[['order_id', 'product_id']],
how='inner', left_on='order_id', right_on='order_id')
Orders_Detailed.columns = [x.replace('_',' ').title().replace('Id','ID') for x in Orders_Detailed.columns.tolist()]
Orders_Detailed = Orders_Detailed.merge(Products_Detailed, how='inner', left_on='Product ID', right_on='Product ID')
#
Orders_by_Department = Orders_Detailed.groupby('Department')['Order ID'].count().\
to_frame('Total Orders').reset_index(drop = False)
Orders_by_Department['Percentage'] = np.round(100* Orders_by_Department['Total Orders']\
/ Orders_by_Department['Total Orders'].sum(),2)
Colors = ['MediumVioletRed']
LC = 'Indigo'
fig = px.bar(Orders_by_Department, x = 'Department', y= 'Percentage', text = 'Total Orders', color_discrete_sequence= Colors,
hover_data = Orders_by_Department.columns)
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[0, 30])
fig.update_layout(title={'text': 'Best Selling Departments', 'x':0.5, 'y':0.94,
'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Products_by_Aisle = Orders_Detailed.groupby(["Department", "Aisle"])["Product ID"].count().\
to_frame('Total Products').reset_index(drop = False)
Products_by_Aisle['Percentage'] = np.round(100* Products_by_Aisle['Total Products']\
/ Products_by_Aisle['Total Products'].sum(),2)
Products_by_Aisle = Products_by_Aisle.sort_values(by='Total Products', ascending=False)
Products_by_Aisle.head().style.hide_index()
Products_by_Aisle = Products_by_Aisle.loc[(Products_by_Aisle['Department'] != 'Missing') &
(Products_by_Aisle['Aisle'] != 'Missing') &
(Products_by_Aisle['Aisle'] != 'Other')]
for d in Products_by_Aisle['Department'].unique().tolist():
Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Percentage'] = \
np.round(100* Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Total Products']\
/Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Total Products'].sum(),2)
# Plot
font = FontProperties()
font.set_weight('bold')
Temp = list(np.linspace(0, 100, 6))
m = Products_by_Aisle['Percentage'].max()
for j in range(len(Temp)-1):
if Temp[j] <= m < Temp[j+1]:
m = Temp[j+1]
break
N = len(Products_by_Aisle.Department.unique())
fpr = 4; C = 0;
Ind = list(np.arange(0, N, fpr))
fig, ax = plt.subplots(nrows=int(np.ceil(N/fpr)), ncols=fpr, figsize=(16,30), sharey = True,
gridspec_kw = dict(hspace=1.4, wspace=.0))
ax = ax.ravel()
_ = fig.delaxes(ax[-1])
Palette = ['Purples', 'Blues', 'Greens', 'Oranges']*5
EdgeColor = ['Indigo', 'Navy', 'ForestGreen','Salmon']*5
Hatch = ['//', 'O', '--', '\\']*5
for (aisle, PBA), ax in zip(Products_by_Aisle.groupby(['Department']), ax.flatten()):
_ = sns.barplot(PBA['Aisle'], PBA['Percentage'] , ax=ax, palette= Palette[C], edgecolor=EdgeColor[C], hatch= Hatch[C])
_ = sns.barplot(PBA['Aisle'], PBA['Percentage'] , ax=ax, facecolor = 'None', edgecolor='k')
if C in Ind:
_ = ax.set(xlabel = 'Aisles', ylabel='Percentage', ylim = [0, 100])
else:
_ = ax.set(xlabel = 'Aisles', ylabel=None, ylim = [0, m])
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
_ = ax.set_title(aisle)
C +=1
_ = fig.suptitle(t = 'Product Sale Distributions', y = .91,
fontproperties=font, fontsize = 16)
##
Colors = myColors.copy()
LC = 'Black'
fig = px.bar(Products_by_Aisle, x = 'Aisle', y='Total Products', color = 'Department',
text = 'Total Products', color_discrete_sequence= Colors,
hover_data= ['Department', 'Total Products'])
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[0, 2e5])
fig.update_layout(title={'text': 'Best Selling Aisles', 'x':0.5, 'y':0.94,
'xanchor': 'center', 'yanchor': 'top'})
fig.show()